dFtraining <- read.csv("pml-training.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dFtesting <- read.csv("pml-testing.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dim(dFtraining); dim(dFtesting)
## [1] 19622 160
## [1] 20 160
set.seed(101)
iNtraining <- createDataPartition(dFtraining$classe, p = 0.8, list = F)
dFvalue <- dFtraining[-iNtraining,]
dFtraining <- dFtraining[iNtraining,]
dim(dFtraining); dim(dFvalue)
## [1] 15699 160
## [1] 3923 160
table(dFtraining$classe)/nrow(dFtraining)
##
## A B C D E
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
beltmissing <- sapply(select(dFtraining,names(dFtraining)[grepl("_belt",names(dFtraining))]),
function(x) sum(is.na(x)))
beltmissing
## roll_belt pitch_belt yaw_belt
## 0 0 0
## total_accel_belt kurtosis_roll_belt kurtosis_picth_belt
## 0 15396 15413
## kurtosis_yaw_belt skewness_roll_belt skewness_roll_belt.1
## 15699 15395 15413
## skewness_yaw_belt max_roll_belt max_picth_belt
## 15699 15388 15388
## max_yaw_belt min_roll_belt min_pitch_belt
## 15396 15388 15388
## min_yaw_belt amplitude_roll_belt amplitude_pitch_belt
## 15396 15388 15388
## amplitude_yaw_belt var_total_accel_belt avg_roll_belt
## 15396 15388 15388
## stddev_roll_belt var_roll_belt avg_pitch_belt
## 15388 15388 15388
## stddev_pitch_belt var_pitch_belt avg_yaw_belt
## 15388 15388 15388
## stddev_yaw_belt var_yaw_belt gyros_belt_x
## 15388 15388 0
## gyros_belt_y gyros_belt_z accel_belt_x
## 0 0 0
## accel_belt_y accel_belt_z magnet_belt_x
## 0 0 0
## magnet_belt_y magnet_belt_z
## 0 0
armmissing <- sapply(select(dFtraining,names(dFtraining)[grepl("_arm",names(dFtraining))]),
function(x) sum(is.na(x)))
armmissing
## roll_arm pitch_arm yaw_arm total_accel_arm
## 0 0 0 0
## var_accel_arm avg_roll_arm stddev_roll_arm var_roll_arm
## 15388 15388 15388 15388
## avg_pitch_arm stddev_pitch_arm var_pitch_arm avg_yaw_arm
## 15388 15388 15388 15388
## stddev_yaw_arm var_yaw_arm gyros_arm_x gyros_arm_y
## 15388 15388 0 0
## gyros_arm_z accel_arm_x accel_arm_y accel_arm_z
## 0 0 0 0
## magnet_arm_x magnet_arm_y magnet_arm_z kurtosis_roll_arm
## 0 0 0 15446
## kurtosis_picth_arm kurtosis_yaw_arm skewness_roll_arm skewness_pitch_arm
## 15448 15398 15445 15448
## skewness_yaw_arm max_roll_arm max_picth_arm max_yaw_arm
## 15398 15388 15388 15388
## min_roll_arm min_pitch_arm min_yaw_arm amplitude_roll_arm
## 15388 15388 15388 15388
## amplitude_pitch_arm amplitude_yaw_arm
## 15388 15388
for_ear_miss <- sapply(select(dFtraining,
names(dFtraining)[grepl("_forearm",names(dFtraining))]),
function(x) sum(is.na(x)))
for_ear_miss
## roll_forearm pitch_forearm yaw_forearm
## 0 0 0
## kurtosis_roll_forearm kurtosis_picth_forearm kurtosis_yaw_forearm
## 15448 15449 15699
## skewness_roll_forearm skewness_pitch_forearm skewness_yaw_forearm
## 15447 15449 15699
## max_roll_forearm max_picth_forearm max_yaw_forearm
## 15388 15388 15448
## min_roll_forearm min_pitch_forearm min_yaw_forearm
## 15388 15388 15448
## amplitude_roll_forearm amplitude_pitch_forearm amplitude_yaw_forearm
## 15388 15388 15448
## total_accel_forearm var_accel_forearm avg_roll_forearm
## 0 15388 15388
## stddev_roll_forearm var_roll_forearm avg_pitch_forearm
## 15388 15388 15388
## stddev_pitch_forearm var_pitch_forearm avg_yaw_forearm
## 15388 15388 15388
## stddev_yaw_forearm var_yaw_forearm gyros_forearm_x
## 15388 15388 0
## gyros_forearm_y gyros_forearm_z accel_forearm_x
## 0 0 0
## accel_forearm_y accel_forearm_z magnet_forearm_x
## 0 0 0
## magnet_forearm_y magnet_forearm_z
## 0 0
Dumb_Bell_Missing <- sapply(select(dFtraining,
names(dFtraining)[grepl("_dumbbell",names(dFtraining))]),
function(x) sum(is.na(x)))
Dumb_Bell_Missing
## roll_dumbbell pitch_dumbbell yaw_dumbbell
## 0 0 0
## kurtosis_roll_dumbbell kurtosis_picth_dumbbell kurtosis_yaw_dumbbell
## 15392 15390 15699
## skewness_roll_dumbbell skewness_pitch_dumbbell skewness_yaw_dumbbell
## 15391 15389 15699
## max_roll_dumbbell max_picth_dumbbell max_yaw_dumbbell
## 15388 15388 15392
## min_roll_dumbbell min_pitch_dumbbell min_yaw_dumbbell
## 15388 15388 15392
## amplitude_roll_dumbbell amplitude_pitch_dumbbell amplitude_yaw_dumbbell
## 15388 15388 15392
## total_accel_dumbbell var_accel_dumbbell avg_roll_dumbbell
## 0 15388 15388
## stddev_roll_dumbbell var_roll_dumbbell avg_pitch_dumbbell
## 15388 15388 15388
## stddev_pitch_dumbbell var_pitch_dumbbell avg_yaw_dumbbell
## 15388 15388 15388
## stddev_yaw_dumbbell var_yaw_dumbbell gyros_dumbbell_x
## 15388 15388 0
## gyros_dumbbell_y gyros_dumbbell_z accel_dumbbell_x
## 0 0 0
## accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x
## 0 0 0
## magnet_dumbbell_y magnet_dumbbell_z
## 0 0
Col_2Dr <- c(names(beltmissing[beltmissing != 0]),
names(armmissing[armmissing != 0]),
names(for_ear_miss[for_ear_miss != 0]),
names(Dumb_Bell_Missing[Dumb_Bell_Missing != 0]))
length(Col_2Dr)
## [1] 100
dF_anly <- tbl_df(dFtraining %>%
select(-Col_2Dr,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(Col_2Dr)` instead of `Col_2Dr` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
dF_anly$classe <- as.factor(dF_anly$classe)
dF_anly[,1:52] <- lapply(dF_anly[,1:52],as.numeric)
dim(dF_anly)
## [1] 15699 53
corres_Column <- cor(select(dF_anly, -classe))
diag(corres_Column) <- 0
corres_Column <- which(abs(corres_Column)>0.8,arr.ind = T)
corres_Column <- unique(row.names(corres_Column))
corrplot(cor(select(dF_anly,corres_Column)),
type="upper", order="hclust",method = "number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(corres_Column)` instead of `corres_Column` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

#correlationfunnel website: https://business-science.github.io/correlationfunnel/
corr_funl_df <- dF_anly %>% binarize(n_bins = 4, thresh_infreq = 0.01)
Corres_a <- corr_funl_df %>% correlate(target = classe__A)
Corres_a %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Corres_b <- corr_funl_df %>% correlate(target = classe__B)
Corres_b %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Corres_c <- corr_funl_df %>% correlate(target = classe__C)
Corres_c %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
Corres_d <- corr_funl_df %>% correlate(target = classe__D)
Corres_d %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
CorreS_e <- corr_funl_df %>% correlate(target = classe__E)
CorreS_e %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
#subseting dF_anly
Colum_a <- c("magnet_arm_x", "pitch_forearm" , "magnet_dumbbell_y",
"roll_forearm", "gyros_dumbbell_y")
Colum_b <- c("magnet_dumbbell_y", "magnet_dumbbell_x" , "roll_dumbbell" ,
"magnet_belt_y" , "accel_dumbbell_x" )
Colum_c <- c("magnet_dumbbell_y", "roll_dumbbell" , "accel_dumbbell_y" ,
"magnet_dumbbell_x", "magnet_dumbbell_z")
Colum_d <- c("pitch_forearm" , "magnet_arm_y" , "magnet_forearm_x",
"accel_dumbbell_y", "accel_forearm_x")
Colum_e <- c("magnet_belt_y" , "magnet_belt_z" , "roll_belt",
"gyros_belt_z" , "magnet_dumbbell_y")
FinaL_Colum <- character()
for(c in c(Colum_a,Colum_b,Colum_c,Colum_d,Colum_e)){
FinaL_Colum <- union(FinaL_Colum, c)
}
dF_AnLy_2 <- dF_anly %>% select(FinaL_Colum, classe)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(FinaL_Colum)` instead of `FinaL_Colum` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data.frame("arm" = sum(grepl("_arm",FinaL_Colum)),
"forearm" = sum(grepl("_forearm",FinaL_Colum)),
"belt" = sum(grepl("_belt",FinaL_Colum)),
"dumbbell" = sum(grepl("_dumbbell",FinaL_Colum)))
## arm forearm belt dumbbell
## 1 2 4 4 7
MyDenS <- function(data, mapping, ...) {
ggplot(data = data, mapping=mapping) +
geom_density(..., alpha = 0.3)+scale_fill_brewer(palette="Set2")
}
MyPoinT <- function(data, mapping, ...) {
ggplot(data = data, mapping=mapping) +
geom_point(..., alpha = 0.1)+ scale_fill_brewer(palette="Set2")
}
ggpairs(dF_AnLy_2, columns = 1:5,aes(color = classe),
lower = list(continuous = MyPoinT),diag = list(continuous = MyDenS))

ggpairs(dF_AnLy_2, columns = 6:10,aes(color = classe),
lower = list(continuous = MyPoinT),diag = list(continuous = MyDenS))

ggpairs(dF_AnLy_2, columns = 11:17,aes(color = classe),
lower = list(continuous = MyPoinT),diag = list(continuous = MyDenS))

dF_TrainF <- dFtraining %>% select(FinaL_Colum,classe)
dF_ValuF <- dFvalue %>% select(FinaL_Colum,classe)
dF_TrainF[,1:17] <- sapply(dF_TrainF[,1:17],as.numeric)
dF_ValuF[,1:17] <- sapply(dF_ValuF[,1:17],as.numeric)
levels <- c("A", "B", "C", "D", "E")
Prep_RopObj <- preProcess(dF_TrainF[,-18],method = c("center","scale","BoxCox"))
X_Tain <- predict(Prep_RopObj,select(dF_TrainF,-classe))
Y_Tain <- factor(dF_TrainF$classe,levels=levels)
X_vaL <- predict(Prep_RopObj,select(dF_ValuF,-classe))
Y_vaL <- factor(dF_ValuF$classe,levels=levels)
trControl <- trainControl(method="cv", number=5)
CT_Modl <- train(x = X_Tain,y = Y_Tain,
method = "rpart", trControl = trControl)
RF_Modl<- train(x = X_Tain,y = Y_Tain,
method = "rf", trControl = trControl,verbose=FALSE, metric = "Accuracy")
GBM_Modl <- train(x = X_Tain,y = Y_Tain,
method = "gbm",trControl=trControl, verbose=FALSE)
SVM_Modl <- svm(x = X_Tain,y = Y_Tain,
kernel = "polynomial", cost = 10)
confusionMatrix(predict(CT_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1003 330 319 294 106
## B 19 256 20 109 103
## C 93 173 345 240 212
## D 0 0 0 0 0
## E 1 0 0 0 300
##
## Overall Statistics
##
## Accuracy : 0.4853
## 95% CI : (0.4696, 0.5011)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3271
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8987 0.33729 0.50439 0.0000 0.41609
## Specificity 0.6263 0.92067 0.77833 1.0000 0.99969
## Pos Pred Value 0.4888 0.50493 0.32455 NaN 0.99668
## Neg Pred Value 0.9396 0.85275 0.88147 0.8361 0.88377
## Prevalence 0.2845 0.19347 0.17436 0.1639 0.18379
## Detection Rate 0.2557 0.06526 0.08794 0.0000 0.07647
## Detection Prevalence 0.5231 0.12924 0.27097 0.0000 0.07673
## Balanced Accuracy 0.7625 0.62898 0.64136 0.5000 0.70789
confusionMatrix(predict(RF_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1112 7 0 0 0
## B 3 741 5 3 1
## C 1 7 676 15 4
## D 0 4 3 625 1
## E 0 0 0 0 715
##
## Overall Statistics
##
## Accuracy : 0.9862
## 95% CI : (0.9821, 0.9896)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9826
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9964 0.9763 0.9883 0.9720 0.9917
## Specificity 0.9975 0.9962 0.9917 0.9976 1.0000
## Pos Pred Value 0.9937 0.9841 0.9616 0.9874 1.0000
## Neg Pred Value 0.9986 0.9943 0.9975 0.9945 0.9981
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2835 0.1889 0.1723 0.1593 0.1823
## Detection Prevalence 0.2852 0.1919 0.1792 0.1614 0.1823
## Balanced Accuracy 0.9970 0.9862 0.9900 0.9848 0.9958
plot(RF_Modl$finalModel,main="Error VS no of tree")

confusionMatrix(predict(GBM_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1083 40 2 5 3
## B 18 642 32 16 11
## C 8 54 635 39 10
## D 4 21 14 582 9
## E 3 2 1 1 688
##
## Overall Statistics
##
## Accuracy : 0.9253
## 95% CI : (0.9166, 0.9333)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9055
##
## Mcnemar's Test P-Value : 2.509e-07
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9704 0.8458 0.9284 0.9051 0.9542
## Specificity 0.9822 0.9757 0.9657 0.9854 0.9978
## Pos Pred Value 0.9559 0.8929 0.8512 0.9238 0.9899
## Neg Pred Value 0.9882 0.9635 0.9846 0.9815 0.9898
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2761 0.1637 0.1619 0.1484 0.1754
## Detection Prevalence 0.2888 0.1833 0.1902 0.1606 0.1772
## Balanced Accuracy 0.9763 0.9108 0.9470 0.9452 0.9760
confusionMatrix(predict(SVM_Modl,X_vaL),Y_vaL)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1096 40 18 17 2
## B 1 676 15 5 6
## C 9 40 640 45 3
## D 10 3 9 575 9
## E 0 0 2 1 701
##
## Overall Statistics
##
## Accuracy : 0.9401
## 95% CI : (0.9322, 0.9473)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9241
##
## Mcnemar's Test P-Value : 1.808e-15
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9821 0.8906 0.9357 0.8942 0.9723
## Specificity 0.9726 0.9915 0.9701 0.9905 0.9991
## Pos Pred Value 0.9344 0.9616 0.8684 0.9488 0.9957
## Neg Pred Value 0.9927 0.9742 0.9862 0.9795 0.9938
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2794 0.1723 0.1631 0.1466 0.1787
## Detection Prevalence 0.2990 0.1792 0.1879 0.1545 0.1795
## Balanced Accuracy 0.9773 0.9411 0.9529 0.9424 0.9857
Test2_dF <- dFtesting %>% select(FinaL_Colum,problem_id)
Test_x <- Test2_dF %>% select(FinaL_Colum)
Res_ult <- data.frame("problem_id" = dFtesting$problem_id,
"PREDICTION_RF" = predict(RF_Modl,Test_x),
"PREDICTION_GBM" = predict(GBM_Modl,Test_x),
"PREDICTION_SVM" = predict(SVM_Modl,Test_x))
Res_ult
## problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1 1 E E C
## 2 2 A E A
## 3 3 A D B
## 4 4 E E C
## 5 5 E E A
## 6 6 E D C
## 7 7 E E B
## 8 8 B D A
## 9 9 A B E
## 10 10 E E E
## 11 11 A E C
## 12 12 A D C
## 13 13 E B E
## 14 14 A D B
## 15 15 E E B
## 16 16 E E A
## 17 17 E E C
## 18 18 B E A
## 19 19 E E A
## 20 20 E E E
dF_TrainF2 <- tbl_df(dFtraining %>%
select(-Col_2Dr,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
X_Tain2 <- dF_TrainF2 %>% select(-classe)
X_Tain2 <- sapply(X_Tain2,as.numeric)
Y_Tain2 <- factor(dF_TrainF2$classe,levels=levels)
dF_ValuF2 <- tbl_df(dFvalue %>%
select(-Col_2Dr,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
X_vaL2 <- dF_ValuF2 %>% select(-classe)
X_vaL2 <- sapply(X_vaL2,as.numeric)
Y_vaL2 <- factor(dF_ValuF2$classe,levels=levels)
F2_Test_dF <- tbl_df(dFtesting %>%
select(-Col_2Dr,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
X_2tesT <- F2_Test_dF %>% select(-problem_id)
X_2tesT <- sapply(X_2tesT,as.numeric)
ID_pb <- dF_ValuF2$classe
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
Cores_N <- makeCluster(detectCores() - 1)
registerDoParallel(cores=Cores_N)
getDoParWorkers()
## [1] 3
RF2_Modl <- train(x = X_Tain2,y = Y_Tain2, method = "rf",
metric = "Accuracy",
trControl=trainControl(method = "cv", number = 4,
p= 0.60, allowParallel = TRUE ))
Res_ult2 <- data.frame("problem_id" = dFtesting$problem_id,
"PREDICTION_RF" = predict(RF_Modl,Test_x),
"PREDICTION_GBM" = predict(GBM_Modl,Test_x),
"PREDICTION_SVM" = predict(SVM_Modl,Test_x),
"PREDICTION_RF2_ALL_COL"=predict(RF2_Modl,X_2tesT))
Res_ult2
## problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1 1 E E C
## 2 2 A E A
## 3 3 A D B
## 4 4 E E C
## 5 5 E E A
## 6 6 E D C
## 7 7 E E B
## 8 8 B D A
## 9 9 A B E
## 10 10 E E E
## 11 11 A E C
## 12 12 A D C
## 13 13 E B E
## 14 14 A D B
## 15 15 E E B
## 16 16 E E A
## 17 17 E E C
## 18 18 B E A
## 19 19 E E A
## 20 20 E E E
## PREDICTION_RF2_ALL_COL
## 1 B
## 2 A
## 3 B
## 4 A
## 5 A
## 6 E
## 7 D
## 8 B
## 9 A
## 10 A
## 11 B
## 12 C
## 13 B
## 14 A
## 15 E
## 16 E
## 17 A
## 18 B
## 19 B
## 20 B